Import data and preprocess
Contents
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from urllib.request import urlopen
import json
import geopandas as gpd
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_35440/3083711657.py in <module>
6 from urllib.request import urlopen
7 import json
----> 8 import geopandas as gpd
ModuleNotFoundError: No module named 'geopandas'
Import data and preprocess¶
data = pd.read_csv("NYCgov_Poverty_Measure_Data__2015_.csv")
features = ['SERIALNO', 'SPORDER', 'AGEP', 'CIT', 'REL', 'SCH',
'SCHG', 'SCHL', 'SEX', 'ESR', 'LANX', 'ENG', 'MSP',
'WKHP', 'DIS', 'JWTR', 'NP', 'TEN', 'HHT', 'AgeCateg', 'Boro',
'CitizenStatus', 'EducAttain', 'Ethnicity', 'FamType_PU', 'FTPTWork',
'INTP_adj', 'MRGP_adj', 'NYCgov_Income', 'NYCgov_Pov_Stat', 'NYCgov_REL',
'NYCgov_Threshold', 'Off_Pov_Stat', 'Off_Threshold', 'OI_adj', 'PA_adj',
'Povunit_ID', 'Povunit_Rel', 'PreTaxIncome_PU', 'RETP_adj', 'RNTP_adj',
'SEMP_adj', 'SSIP_adj', 'SSP_adj', 'TotalWorkHrs_PU', 'WAGP_adj']
#Recode = code in dictionary
# CIT: Citenzenship
# REL: is relationship ie. Daughter, Son, etc. is ACS code ()
# SCH, SCHG: (SCHG is ACS code) for educaiton
# SCHL: Education attainment ACS code
# ESR: Employement status (code in dictionary file)
# LANX: language other than language spoken
# ENG: ability to speak english
# MSP: Married or not (code in dictionary file)
# MAR: Marital status
# WKHP: huors work per week
# DIS: disability (Recode)
# JWTR: transportation to work (ACS)
# NP: number of people in household
# TEN: Housing tenure
# FamType_PT: PovertyUnit familytype (umiddelbart fjerne)
# FTPTWork: work experience (recode)
# INTP_adj: Income adjusted
# MRGP_adj: Morgage amount adjusted
# SEMP_adj: self employed
# SSIP_adh: supplementary income
# SSP_adj: social socurity income (people who are disabled)
# WAGP_adj: Wages
Visulization¶
Number of healty tree in each district
Probablity of healthy tree in each district
histogram of diameter
histogram of depth
plot of location for trees (heatmap)
X = data[features]
i = 1
plt.figure(figsize=(25, 25))
for feature in features[1:]:
plt.subplot(4, 4, i)
if feature == 'spc_common':
g = sns.scatterplot(x='spc_common',y='tree_dbh',hue='health',data=X)
g.set_xticks(np.arange(0,name_fact[0].max(),20)) # <--- set the ticks first
g.set_xticklabels(np.arange(0,name_fact[0].max(),20))
else:
sns.scatterplot(x=feature,y='tree_dbh',hue='health',data=X)
i += 1
X_grouped = X.groupby(['Boro']).median()
gdf = gpd.read_file('https://raw.githubusercontent.com/dwillis/nyc-maps/master/boroughs.geojson')
gdf.to_crs(epsg=4326, inplace=True)
gdf.set_index('BoroName', inplace=True)
gdf['BoroCode'] = [5,4,2,3,1]
gdf.sort_index(inplace=True)
X_grouped['BoroName'] = ['Bronx','Brooklyn','Manhattan','Queens','Staten Island']
X_grouped.set_index('BoroName',inplace=True)
att = 'PreTaxIncome_PU'
fig = px.choropleth_mapbox(X_grouped[att], geojson=gdf['geometry'], locations=gdf.index, color=X_grouped[att],
color_continuous_scale="Viridis",
range_color=(X_grouped[att].min(),X_grouped[att].max()),
mapbox_style="carto-positron",
zoom=8.9, center = {"lat": 40.730610, "lon": -73.935242},
opacity=0.5,
labels={att:att}
)
fig.update_layout(margin={"r":300,"t":100,"l":200,"b":0})
fig.show("notebook")
/Users/haraldskat-rordam/opt/anaconda3/lib/python3.7/site-packages/google/colab/data_table.py:30: UserWarning:
IPython.utils.traitlets has moved to a top-level traitlets package.
X_grouped[[att,'EducAttain']]
| PreTaxIncome_PU | EducAttain | |
|---|---|---|
| BoroName | ||
| Bronx | 42553.719 | 2.0 |
| Brooklyn | 60075.840 | 2.0 |
| Manhattan | 75094.797 | 4.0 |
| Queens | 70288.734 | 2.0 |
| Staten Island | 90113.758 | 2.0 |
attributes in ML model¶
NP: Number of people in house hold
Race
Sex
Boro
Age
LANX: language other than language spoken
DIS: disability (Recode)